llama_cpp 0.16.0 → 0.16.2

This diff compares the contents of publicly available package versions as published to their respective public registries, and is provided for informational purposes only.
Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/ext/llama_cpp/extconf.rb +3 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +14 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +4 -0
  7. data/vendor/tmp/llama.cpp/Makefile +119 -54
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
  126. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  127. data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
  128. data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
  129. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
  130. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
  131. data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
  132. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
  133. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
  134. data/vendor/tmp/llama.cpp/ggml.c +158 -414
  135. data/vendor/tmp/llama.cpp/ggml.h +6 -0
  136. data/vendor/tmp/llama.cpp/llama.cpp +628 -279
  137. data/vendor/tmp/llama.cpp/llama.h +9 -1
  138. data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
  139. data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
  140. data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
  141. data/vendor/tmp/llama.cpp/unicode.h +1 -1
  142. metadata +15 -3
@@ -21,6 +21,10 @@
  # include "ggml-kompute.h"
  #endif

+ #ifdef GGML_USE_BLAS
+ # include "ggml-blas.h"
+ #endif
+
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
@@ -282,6 +286,7 @@ enum llm_kv {
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
@@ -360,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
- { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -704,6 +710,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1273,6 +1280,126 @@ struct no_init {
  };

  struct llama_file {
+
+ #if defined(_WIN32)
+ // use FILE * so we don't have to re-open the file to mmap
+ FILE * fp;
+ HANDLE fp_win32;
+ size_t size;
+
+ private:
+ std::string GetErrorMessageWin32(DWORD error_code) const {
+ std::string ret;
+ LPSTR lpMsgBuf = NULL;
+ DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+ if (!bufLen) {
+ ret = format("Win32 error code: %s", error_code);
+ } else {
+ ret = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ }
+
+ return ret;
+ }
+
+ public:
+
+ llama_file(const char * fname, const char * mode) {
+ fp = ggml_fopen(fname, mode);
+ if (fp == NULL) {
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ }
+ fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+
+ size_t tell() const {
+ // SetFilePointerEx returns the current position when seeking relative 0 bytes
+ LARGE_INTEGER li;
+ li.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+
+ return li.QuadPart;
+ }
+
+ void seek(size_t offset, int whence) const {
+ // no need to convert SEEK_* to FILE_*. The enums are the same.
+ // Still, keep static asserts to avoid failures in the future.
+ static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+ static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+ static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+ LARGE_INTEGER li;
+ li.QuadPart = offset;
+ BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ }
+
+ void read_raw(void * ptr, size_t len) const {
+ // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+ // use the Win32 API to do file io instead of the C/C++ library functions.
+
+ // There are conditions under which ReadFile cannot read chunks >64MB.
+ // Thus split the operation into smaller chunks if len exceeds this limit.
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+ DWORD chunk_read = 0;
+ BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+ if (!result) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_read < chunk_size || chunk_read == 0) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ bytes_read += chunk_read;
+ } ;
+ }
+
+ uint32_t read_u32() const {
+ uint32_t val;
+ read_raw(&val, sizeof(val));
+ return val;
+ }
+
+ void write_raw(const void * ptr, size_t len) const {
+ // There are conditions under which WriteFile cannot write chunks >64MB.
+ // Thus split the operation into smaller chunks if len exceeds this limit.
+ size_t bytes_written = 0;
+ while (bytes_written < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+ DWORD chunk_written = 0;
+ BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+ if (!result) {
+ throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_written < chunk_size || chunk_written == 0) {
+ throw std::runtime_error("unexpectedly failed to write bytes");
+ }
+
+ bytes_written += chunk_written;
+ }
+ }
+
+ void write_u32(std::uint32_t val) const {
+ write_raw(&val, sizeof(val));
+ }
+
+ ~llama_file() {
+ if (fp) {
+ std::fclose(fp);
+ }
+ }
+ #else
  // use FILE * so we don't have to re-open the file to mmap
  FILE * fp;
  size_t size;
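
The Win32 read_raw and write_raw methods above cap each ReadFile/WriteFile call at 64 MB and loop until the full request completes, because ReadFile can fail on larger chunks. A minimal portable sketch of the same chunking pattern, with std::fread standing in for the Win32 API (an illustration only, not part of the diff):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>

    // Read len bytes from fp in chunks of at most 64 MB, mirroring the
    // loop structure of llama_file::read_raw above.
    static void read_chunked(std::FILE * fp, void * ptr, size_t len) {
        const size_t max_chunk = 64u * 1024 * 1024;
        size_t bytes_read = 0;
        while (bytes_read < len) {
            const size_t chunk_size = std::min(len - bytes_read, max_chunk);
            const size_t n = std::fread(static_cast<char *>(ptr) + bytes_read, 1, chunk_size, fp);
            if (n != chunk_size) {
                throw std::runtime_error("unexpectedly reached end of file");
            }
            bytes_read += n;
        }
    }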
@@ -1293,7 +1420,10 @@ struct llama_file {
  #else
  long ret = std::ftell(fp);
  #endif
- GGML_ASSERT(ret != -1); // this really shouldn't fail
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
  return (size_t) ret;
  }

@@ -1303,7 +1433,9 @@ struct llama_file {
  #else
  int ret = std::fseek(fp, (long) offset, whence);
  #endif
- GGML_ASSERT(ret == 0); // same
+ if (ret != 0) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
  }

  void read_raw(void * ptr, size_t len) const {
@@ -1346,6 +1478,7 @@ struct llama_file {
  std::fclose(fp);
  }
  }
+ #endif
  };
  using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1839,6 +1972,7 @@ struct llama_hparams {
  uint32_t n_lora_q = 0;
  uint32_t n_lora_kv = 0;
  uint32_t n_ff_exp = 0;
+ uint32_t n_ff_shexp = 0;
  uint32_t n_expert_shared = 0;
  float expert_weights_scale = 0.0;

@@ -1887,6 +2021,7 @@ struct llama_hparams {
  if (this->n_lora_q != other.n_lora_q) return true;
  if (this->n_lora_kv != other.n_lora_kv) return true;
  if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_ff_shexp != other.n_ff_shexp) return true;
  if (this->n_expert_shared != other.n_expert_shared) return true;

  if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2158,6 +2293,8 @@ struct llama_vocab {
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
  enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+ int max_token_len = 0; // used for optimizing longest token search
+
  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;

@@ -2175,16 +2312,17 @@ struct llama_vocab {
  id special_cls_id = -1;
  id special_mask_id = -1;

- int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
- int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
  id linefeed_id = 13;
  id special_prefix_id = -1;
  id special_suffix_id = -1;
  id special_middle_id = -1;
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

- bool add_space_prefix = true;
+ // tokenizer flags
+ bool tokenizer_add_space_prefix = true;
+ bool tokenizer_add_bos = false;
+ bool tokenizer_add_eos = false;
+ bool tokenizer_ignore_merges = false;

  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -2298,9 +2436,13 @@ struct llama_context {
  std::vector<ggml_backend_t> backends;
  #ifdef GGML_USE_METAL
  ggml_backend_t backend_metal = nullptr;
+ #endif
+ #ifdef GGML_USE_BLAS
+ ggml_backend_t backend_blas = nullptr;
  #endif
  ggml_backend_t backend_cpu = nullptr;

+
  const llama_model & model;

  // key + value cache for the self attention
@@ -3712,6 +3854,44 @@ struct llama_model_loader {
  std::vector<no_init<uint8_t>> read_buf;
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+ #if defined(GGML_USE_CUDA)
+ // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+ // NVMe raid configurations might require more / larger buffers.
+ constexpr size_t num_buffers = 4;
+ constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ std::vector<ggml_backend_buffer_t> host_buffers;
+ std::vector<void*> host_ptrs;
+ std::vector<ggml_backend_event_t> events;
+ size_t buffer_idx = 0; // buffer to use for async loads
+
+ ggml_backend_t cuda_backend = nullptr;
+ if (!use_mmap && !check_tensors) {
+ // When not using mmaped io use async uploads from pinned memory to GPU memory.
+ // First determine if the CUDA backend is active, and if so, determine the device ID.
+ ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+ if (buf) {
+ ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+ auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+ if (buffer_type == cuda_buffer_type) {
+ cuda_backend = ggml_backend_cuda_init(i);
+ break;
+ }
+ }
+ }
+
+ // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+ if (cuda_backend) {
+ for (size_t idx = 0; idx < num_buffers; ++idx) {
+ host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+ events.emplace_back(ggml_backend_event_new(cuda_backend));
+ }
+ }
+ }
+ #endif
+
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  const auto * weight = get_weight(ggml_get_name(cur));
  if (weight == nullptr) {
@@ -3767,12 +3947,36 @@ struct llama_model_loader {
  }));
  }
  } else {
- read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
- ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
- if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ #if defined(GGML_USE_CUDA)
+ // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+ if (cuda_backend) {
+ file->seek(weight->offs, SEEK_SET);
+
+ size_t bytes_read = 0;
+
+ while (bytes_read < n_size) {
+ size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+ ggml_backend_event_synchronize(events[buffer_idx]);
+ file->read_raw(host_ptrs[buffer_idx], read_iteration);
+ ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+ ggml_backend_event_record(events[buffer_idx]);
+
+ bytes_read += read_iteration;
+ ++buffer_idx;
+ buffer_idx %= num_buffers;
+ }
+ }
+ else
+ #endif
+ {
+ read_buf.resize(n_size);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }
  }
  }
@@ -3780,6 +3984,18 @@ struct llama_model_loader {
  size_done += n_size;
  }

+ #if defined(GGML_USE_CUDA)
+ // free temporary resources used for async cuda uploads
+ if (cuda_backend) {
+ for (size_t idx = 0; idx < num_buffers;++idx) {
+ ggml_backend_event_synchronize(events[idx]);
+ ggml_backend_event_free(events[idx]);
+ ggml_backend_buffer_free(host_buffers[idx]);
+ }
+ ggml_backend_free(cuda_backend);
+ }
+ #endif
+
  // check validation results
  bool validation_failed = false;
  for (auto & future : validation_result) {
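
The loader above rotates through num_buffers pinned staging buffers: it waits on a buffer's event before reusing it, so the file read into one buffer overlaps with the in-flight uploads from the others. A self-contained sketch of that round-robin discipline, with the ggml event and upload calls replaced by hypothetical placeholders (wait_for_upload, start_async_upload, and mark_in_flight are assumptions for illustration, not real ggml API):

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    constexpr size_t kNumBuffers = 4;               // matches num_buffers above
    constexpr size_t kBufferSize = 1 * 1024 * 1024; // 1 MB staging chunks

    // No-op stand-ins for ggml_backend_event_synchronize,
    // ggml_backend_tensor_set_async, and ggml_backend_event_record,
    // so the sketch compiles standalone.
    static void wait_for_upload(size_t /*buf*/) {}
    static void start_async_upload(const char * /*src*/, size_t /*offset*/, size_t /*len*/) {}
    static void mark_in_flight(size_t /*buf*/) {}

    // Stream n_size bytes from src to the device through rotating staging buffers.
    static void upload_rotating(const char * src, size_t n_size) {
        std::array<std::vector<char>, kNumBuffers> staging;
        for (auto & b : staging) b.resize(kBufferSize);

        size_t buffer_idx = 0;
        size_t bytes_read = 0;
        while (bytes_read < n_size) {
            const size_t chunk = std::min(kBufferSize, n_size - bytes_read);
            wait_for_upload(buffer_idx);                                       // buffer is free again
            std::memcpy(staging[buffer_idx].data(), src + bytes_read, chunk);  // stands in for file->read_raw
            start_async_upload(staging[buffer_idx].data(), bytes_read, chunk); // tensor_set_async
            mark_in_flight(buffer_idx);                                        // event_record
            bytes_read += chunk;
            buffer_idx = (buffer_idx + 1) % kNumBuffers;
        }
    }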
@@ -4246,6 +4462,9 @@ static void llm_load_hparams(
  } break;
  case LLM_ARCH_QWEN2MOE:
  {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4552,38 +4771,9 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

- // For Fill-In-the-Middle (FIM)/infill models which where converted
- // prior to support of FIM special tokens in GGUF, the following
- // will allow those models to continue to work. The general names
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
- // new versions of these models have been published.
- std::string gen_name;
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
- [](unsigned char c){ return std::tolower(c); });
-
- if (gen_name.find("code") != std::string::npos) {
- if (model.arch == LLM_ARCH_LLAMA) {
- vocab.special_prefix_id = 32007;
- vocab.special_suffix_id = 32008;
- vocab.special_middle_id = 32009;
- vocab.special_eot_id = 32010;
- } else if (model.arch == LLM_ARCH_GEMMA) {
- vocab.special_prefix_id = 67;
- vocab.special_suffix_id = 69;
- vocab.special_middle_id = 68;
- // TODO: this is not EOT, it is "file separator" token, needs fix
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
- //vocab.special_eot_id = 70;
- vocab.special_eot_id = 107;
- }
- }
-
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  } // The default value of add_space_prefix is true.
  } else if (tokenizer_model == "bert") {
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4596,13 +4786,13 @@ static void llm_load_vocab(
  vocab.special_pad_id = 0;
  vocab.special_cls_id = 101;
  vocab.special_mask_id = 103;
- vocab.add_space_prefix = false;
+ vocab.tokenizer_add_space_prefix = false;
  } else if (tokenizer_model == "gpt2") {
  vocab.type = LLAMA_VOCAB_TYPE_BPE;

  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  }

  // read bpe merges and populate bpe ranks
@@ -4653,14 +4843,15 @@ static void llm_load_vocab(
  LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
  LLAMA_LOG_WARN("%s: \n", __func__);
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "default") {
+ } else if (tokenizer_pre == "default") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  } else if (
  tokenizer_pre == "llama3" ||
  tokenizer_pre == "llama-v3" ||
  tokenizer_pre == "llama-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+ vocab.tokenizer_ignore_merges = true;
+ vocab.tokenizer_add_bos = true;
  } else if (
  tokenizer_pre == "deepseek-llm") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4681,7 +4872,8 @@ static void llm_load_vocab(
  tokenizer_pre == "jina-es" ||
  tokenizer_pre == "jina-de" ||
  tokenizer_pre == "jina-v2-es" ||
- tokenizer_pre == "jina-v2-de") {
+ tokenizer_pre == "jina-v2-de" ||
+ tokenizer_pre == "jina-v2-code") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4704,9 +4896,20 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "smaug-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ } else if (
+ tokenizer_pre == "poro-chat") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_bos = true;
+ vocab.tokenizer_add_eos = false;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_bos = true;
+ vocab.tokenizer_add_eos = false;
  } else {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  }
@@ -4738,6 +4941,7 @@ static void llm_load_vocab(
  GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

  vocab.token_to_id[word] = i;
+ vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

  auto & token_data = vocab.id_to_token[i];
  token_data.text = std::move(word);
@@ -4761,6 +4965,45 @@ static void llm_load_vocab(

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA
+ && 32010 < vocab.id_to_token.size()
+ && vocab.id_to_token[32007].text == "<PRE>"
+ && vocab.id_to_token[32008].text == "<SUF>"
+ && vocab.id_to_token[32009].text == "<MID>"
+ && vocab.id_to_token[32010].text == "<EOT>") {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA
+ && 107 < vocab.id_to_token.size()
+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
+ && vocab.id_to_token[68].text == "<|fim_middle|>"
+ && vocab.id_to_token[107].text == "<end_of_turn>") {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
+ }
+ }
+
  try {
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
  } catch (const std::exception & e) {
@@ -4812,10 +5055,10 @@ static void llm_load_vocab(
  bool temp = true;

  if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
- vocab.special_add_bos = int(temp);
+ vocab.tokenizer_add_bos = temp;
  }
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
- vocab.special_add_eos = int(temp);
+ vocab.tokenizer_add_eos = temp;
  }
  }

@@ -4915,7 +5158,7 @@ static void llm_load_vocab(
  );

  // set attributes by model/tokenizer name
- if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
  _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
  for (auto id : vocab.cache_special_tokens) {
@@ -5009,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }

+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5018,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  }
+
+ if (model.arch == LLM_ARCH_QWEN2MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ }
  }

  // Returns false if cancelled by progress_callback
@@ -5161,7 +5411,7 @@ static bool llm_load_tensors(
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = n_embd / hparams.n_head;
+ const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5515,7 +5765,7 @@ static bool llm_load_tensors(

  layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  } else {
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  }

  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5806,9 @@ static bool llm_load_tensors(
  layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
  layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});

@@ -5801,16 +6054,17 @@ static bool llm_load_tensors(
  GGML_ASSERT(hparams.n_expert_used > 0);

  // MoE branch
- auto n_ff_exp = n_ff / hparams.n_expert_used;
+ auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
  layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

  // Shared expert branch
+ auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
  }
  } break;
  case LLM_ARCH_PHI2:
@@ -6600,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  }
  #endif

- #ifdef GGML_USE_SYCL
- if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
- ggml_backend_sycl_set_single_device_mode(params.main_gpu);
- //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
- params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
- } else {
- ggml_backend_sycl_set_mul_device_mode();
- }
- #endif
-
  if (!llm_load_tensors(
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
  params.progress_callback, params.progress_callback_user_data
@@ -7410,6 +7654,50 @@ struct llm_build_context {
  return lctx.inp_s_seq;
  }

+ struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+ // find result_norm tensor for input
+ struct ggml_tensor * inp = nullptr;
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ inp = gf->nodes[i];
+ if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+ break;
+ } else {
+ inp = nullptr;
+ }
+ }
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+ struct ggml_tensor * cur;
+
+ switch (pooling_type) {
+ case LLAMA_POOLING_TYPE_MEAN:
+ {
+ struct ggml_tensor * inp_mean = build_inp_mean();
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+ } break;
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ struct ggml_tensor * inp_cls = build_inp_cls();
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
+ } break;
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ cur = inp;
+ } break;
+ default:
+ {
+ GGML_ASSERT(false && "unknown pooling type");
+ } break;
+ }
+
+ cb(cur, "result_embd_pooled", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct ggml_cgraph * build_llama() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -8390,8 +8678,6 @@ struct llm_build_context {
  if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  inp_pos = build_inp_pos();
  }
- struct ggml_tensor * inp_mean = build_inp_mean();
- struct ggml_tensor * inp_cls = build_inp_cls();

  // construct input embeddings (token, type, position)
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8519,6 +8805,11 @@ struct llm_build_context {
  // attention layer norm
  cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

+ if (model.layers[il].attn_norm_2 != nullptr) {
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+ }
+
  struct ggml_tensor * ffn_inp = cur;
  cb(ffn_inp, "ffn_inp", il);

@@ -8561,28 +8852,6 @@ struct llm_build_context {
  cur = inpL;
  cb(cur, "result_embd", -1);

- // pooling layer
- switch (pooling_type) {
- case LLAMA_POOLING_TYPE_NONE:
- {
- // nop
- } break;
- case LLAMA_POOLING_TYPE_MEAN:
- {
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
- cb(cur, "result_embd_pooled", -1);
- } break;
- case LLAMA_POOLING_TYPE_CLS:
- {
- cur = ggml_get_rows(ctx0, cur, inp_cls);
- cb(cur, "result_embd_pooled", -1);
- } break;
- case LLAMA_POOLING_TYPE_UNSPECIFIED:
- {
- GGML_ASSERT(false && "Invalid pooling type");
- } break;
- }
-
  ggml_build_forward_expand(gf, cur);

  return gf;
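
With pooling moved out of the per-architecture graph builders and into append_pooling, MEAN pooling is a matrix product against per-sequence weights (inp_mean) and CLS/LAST pooling are row gathers (inp_cls). A plain C++ sketch of what those reductions compute on a flat n_tokens x n_embd buffer, outside of ggml and under a single-sequence assumption (illustration only):

    #include <cstddef>
    #include <vector>

    // embd holds n_tokens rows of n_embd floats. This is what MEAN pooling
    // computes when every token belongs to one sequence and inp_mean carries
    // uniform weights 1/n_tokens.
    std::vector<float> mean_pool(const std::vector<float> & embd, size_t n_tokens, size_t n_embd) {
        std::vector<float> out(n_embd, 0.0f);
        for (size_t t = 0; t < n_tokens; ++t) {
            for (size_t e = 0; e < n_embd; ++e) {
                out[e] += embd[t * n_embd + e] / (float) n_tokens;
            }
        }
        return out;
    }

    // CLS/LAST pooling reduce to picking one row per sequence
    // (ggml_get_rows with the indices prepared in inp_cls).
    std::vector<float> row_pool(const std::vector<float> & embd, size_t n_embd, size_t row) {
        return {embd.begin() + row * n_embd, embd.begin() + (row + 1) * n_embd};
    }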
@@ -11520,7 +11789,8 @@ static struct ggml_cgraph * llama_build_graph(
  if (batch.n_tokens < 32 || full_offload) {
  if (il != -1 && strcmp(name, "norm") == 0) {
  for (auto * backend : lctx.backends) {
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+ if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+ (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
  ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
  break;
  }
@@ -11666,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
  GGML_ASSERT(false);
  }

+ // add on pooling layer
+ if (lctx.cparams.embeddings) {
+ result = llm.append_pooling(result);
+ }
+
  llm.free();

  return result;
@@ -11755,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  // (!a || b) is a logical implication (a -> b)
  // !hparams.causal_attn -> !cparams.causal_attn
  (hparams.causal_attn || !cparams.causal_attn) &&
- "causal attention with embedding models is not supported"
+ "causal attention is not supported by this model"
  );

  if (lctx.inp_KQ_mask) {
@@ -11887,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(lctx.inp_cls);
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+ std::vector<int> last_pos(n_tokens, -1);
+ std::vector<int> last_row(n_tokens, -1);
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ const llama_pos pos = batch.pos[i];
+
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+ if (pos >= last_pos[seq_id]) {
+ last_pos[seq_id] = pos;
+ last_row[seq_id] = i;
+ }
+ }
+
+ for (int i = 0; i < n_tokens; ++i) {
+ if (last_row[i] >= 0) {
+ data[i] = last_row[i];
+ }
+ }
+ }
+
  if (kv_self.recurrent) {
  const int64_t n_kv = kv_self.n;

@@ -11948,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
  const auto n_embd = hparams.n_embd;

  // TODO: use a per-batch flag for logits presence instead
- const bool has_logits = cparams.causal_attn;
- const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+ const bool has_logits = !cparams.embeddings;
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
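
For LLAMA_POOLING_TYPE_LAST, the input setup in llama_set_inputs above scans the batch once and records, per sequence, the row of the token with the highest position. A standalone toy example of the same scan (the batch values are made up for illustration):

    #include <cstdio>
    #include <vector>

    int main() {
        // toy batch: token i belongs to seq_id[i] and sits at position pos[i]
        const std::vector<int> seq_id = {0, 0, 1, 1, 1};
        const std::vector<int> pos    = {0, 1, 0, 1, 2};
        const int n_tokens = (int) seq_id.size();

        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);
        for (int i = 0; i < n_tokens; ++i) {
            if (pos[i] >= last_pos[seq_id[i]]) {
                last_pos[seq_id[i]] = pos[i];
                last_row[seq_id[i]] = i;
            }
        }
        // prints: seq 0 -> row 1, seq 1 -> row 4
        for (int s = 0; s < n_tokens; ++s) {
            if (last_row[s] >= 0) std::printf("seq %d -> row %d\n", s, last_row[s]);
        }
        return 0;
    }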
@@ -12017,6 +12323,11 @@ static void llama_graph_compute(
12017
12323
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
12018
12324
  ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
12019
12325
  }
12326
+ #ifdef GGML_USE_BLAS
12327
+ if (lctx.backend_blas != nullptr) {
12328
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
12329
+ }
12330
+ #endif
12020
12331
 
12021
12332
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
12022
12333
 
@@ -12074,11 +12385,13 @@ static int llama_decode_internal(
12074
12385
  std::vector<std::vector<llama_seq_id>> seq_id;
12075
12386
 
12076
12387
  // count outputs
12077
- if (batch_all.logits) {
12388
+ if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
12389
+ n_outputs = n_tokens_all;
12390
+ } else if (batch_all.logits) {
12078
12391
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
12079
12392
  n_outputs += batch_all.logits[i] != 0;
12080
12393
  }
12081
- } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
12394
+ } else if (lctx.logits_all) {
12082
12395
  n_outputs = n_tokens_all;
12083
12396
  } else {
12084
12397
  // keep last output only
@@ -12209,47 +12522,19 @@ static int llama_decode_internal(
12209
12522
  // no output
12210
12523
  res = nullptr;
12211
12524
  embd = nullptr;
12212
- } else if (!hparams.causal_attn) {
12213
- res = nullptr; // do not extract logits for embedding models such as BERT
12214
-
12215
- // token or sequence embeddings
12216
- embd = gf->nodes[gf->n_nodes - 1];
12217
-
12218
- GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
12219
12525
  } else if (cparams.embeddings) {
12220
- // the embeddings could be in the second to last tensor, or any of the previous tensors
12221
- int i_embd = gf->n_nodes - 2;
12222
- for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
12223
- i_embd = gf->n_nodes - i;
12224
- if (i_embd < 0) { break; }
12225
- embd = gf->nodes[i_embd];
12226
- }
12227
- GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
12228
-
12229
- // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
12230
- if (!cparams.causal_attn) {
12231
- res = nullptr; // do not extract logits when not needed
12232
- // skip computing logits
12233
- // TODO: is this safe?
12234
- gf->n_nodes = i_embd + 1;
12526
+ res = nullptr; // do not extract logits for embedding case
12527
+ embd = gf->nodes[gf->n_nodes - 1];
12528
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
12529
+ embd = gf->nodes[gf->n_nodes - 2];
12235
12530
  }
12531
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
12236
12532
  } else {
12237
12533
  embd = nullptr; // do not extract embeddings when not needed
12238
12534
  GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
12239
12535
  }
12240
12536
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
12241
12537
 
12242
- // for big prompts, if BLAS is enabled, it is better to use only one thread
12243
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
12244
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
12245
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
12246
- // with the BLAS calls. need a better solution
12247
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
12248
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
12249
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
12250
- n_threads = std::min(4, n_threads);
12251
- }
12252
-
12253
12538
  ggml_backend_sched_alloc_graph(lctx.sched, gf);
12254
12539
 
12255
12540
  llama_set_inputs(lctx, u_batch);
@@ -12312,11 +12597,10 @@ static int llama_decode_internal(
12312
12597
  ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
12313
12598
  }
12314
12599
  } break;
12315
- case LLAMA_POOLING_TYPE_CLS:
12316
12600
  case LLAMA_POOLING_TYPE_MEAN:
12601
+ case LLAMA_POOLING_TYPE_CLS:
12602
+ case LLAMA_POOLING_TYPE_LAST:
12317
12603
  {
12318
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
12319
-
12320
12604
  // extract sequence embeddings
12321
12605
  auto & embd_seq_out = lctx.embd_seq;
12322
12606
  embd_seq_out.clear();
@@ -12930,107 +13214,142 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
-
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
- int final_prev_index = -1;
- bool ignore_merges = false;
-
- std::vector<std::string> word_collection;
- switch (vocab.type) {
- case LLAMA_VOCAB_TYPE_BPE:
- switch (vocab.type_pre) {
- case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
- ignore_merges = true;
- word_collection = unicode_regex_split(text, {
- // original regex from tokenizer.json
- //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-
- // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
- case LLAMA_VOCAB_PRE_TYPE_SMAUG:
- word_collection = unicode_regex_split(text, {
- // same as llama3
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
- word_collection = unicode_regex_split(text, {
- "[\r\n]",
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
- "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
- "\\s+$",
- "[一-龥ࠀ-一가-퟿]+",
- "\\p{N}+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
- word_collection = unicode_regex_split(text, {
- "[\r\n]",
- "\\s?\\p{L}+",
- "\\s?\\p{P}+",
- "[一-龥ࠀ-一가-퟿]+",
- "\\p{N}",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_FALCON:
- word_collection = unicode_regex_split(text, {
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- "[0-9][0-9][0-9]",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_MPT:
- // TODO: MPT pre-tokenization regexes are unknown
- // the following are close, but not exact. run the following:
- // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
- GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
- word_collection = unicode_regex_split(text, {
- "\\s?\\p{L}+",
- "\\s?\\p{P}+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_STARCODER:
- case LLAMA_VOCAB_PRE_TYPE_REFACT:
- case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
- word_collection = unicode_regex_split(text, {
- "\\p{N}",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_GPT2:
- case LLAMA_VOCAB_PRE_TYPE_OLMO:
- word_collection = unicode_regex_split(text, {
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
- case LLAMA_VOCAB_PRE_TYPE_QWEN2:
- word_collection = unicode_regex_split(text, {
- // original regex from tokenizer.json
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- default:
- // default regex for BPE tokenization pre-processing
- word_collection = unicode_regex_split(text, {
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- "\\p{N}+",
- "[0-9][0-9][0-9]",
- });
- break;
- }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+ GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+ switch (vocab.type_pre) {
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+ regex_exprs = {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+ "\\s+$",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?\\p{L}+",
+ "\\s?\\p{P}+",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "[0-9][0-9][0-9]",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
+ // TODO: MPT pre-tokenization regexes are unknown
+ // the following are close, but not exact. run the following:
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+ regex_exprs = {
+ "\\s?\\p{L}+",
+ "\\s?\\p{P}+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+ regex_exprs = {
+ "\\p{N}",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
+ regex_exprs = {
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ regex_exprs = {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ };
  break;
  default:
- GGML_ASSERT(false);
+ // default regex for BPE tokenization pre-processing
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "\\p{N}+",
+ "[0-9][0-9][0-9]",
+ };
  break;
  }
+ }
+
+ void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+ output.push_back(token_id);
+ }
+
+ bool append_bos(std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_bos) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ return true;
+ }
+ return false;
+ }
+
+ bool append_eos(std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_eos) {
+ GGML_ASSERT(vocab.special_eos_id != -1);
+ output.push_back(vocab.special_eos_id);
+ return true;
+ }
+ return false;
+ }
+
+ void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+ "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ }
+
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ int final_prev_index = -1;
+
+ const auto word_collection = unicode_regex_split(text, regex_exprs);

  symbols_final.clear();

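The refactor above moves pre-tokenizer regex selection into the constructor (stored in regex_exprs), so one llm_tokenizer_bpe instance can be reused across all text fragments. From the outside the whole path is driven through the public C API; a minimal sketch (buffer size is illustrative):

std::vector<llama_token> tokens(512);
const char * text = "Hello world";
int32_t n = llama_tokenize(model, text, (int32_t) strlen(text),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_special=*/true, /*parse_special=*/false);
if (n < 0) {
    tokens.resize(-n); // buffer was too small; -n is the required token count
    n = llama_tokenize(model, text, (int32_t) strlen(text),
                       tokens.data(), (int32_t) tokens.size(), true, false);
}
tokens.resize(n);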
@@ -13041,7 +13360,7 @@ struct llm_tokenizer_bpe {
  int index = 0;
  size_t offset = 0;

- if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
  symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
  offset = word.size();
  }
@@ -13122,10 +13441,9 @@ struct llm_tokenizer_bpe {
  for (auto j = str.begin(); j != str.end(); ++j) {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
- if (token_multibyte == vocab.token_to_id.end()) {
- throw std::runtime_error("ERROR: byte not found in vocab");
+ if (token_multibyte != vocab.token_to_id.end()) {
+ output.push_back(token_multibyte->second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
@@ -13164,6 +13482,8 @@ private:

  const llama_vocab & vocab;

+ std::vector<std::string> regex_exprs;
+
  std::vector<llm_symbol> symbols;
  std::vector<llm_symbol> symbols_final;

@@ -13173,7 +13493,7 @@ private:
  struct llm_tokenizer_wpm {
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
  const auto & token_map = vocab.token_to_id;

  // normalize and split by whitespace
@@ -13182,7 +13502,7 @@ struct llm_tokenizer_wpm {
  // bos token prepended already

  // find the longest tokens that form the words
- for (const std::string &word : words) {
+ for (const std::string & word : words) {
  // skip empty words
  if (word.size() == 0) {
  continue;
@@ -13199,7 +13519,7 @@ struct llm_tokenizer_wpm {
  for (int i = 0; i < n; ++i) {
  // loop through possible match length
  bool match = false;
- for (int j = n; j > i; j--) {
+ for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
  auto it = token_map.find(word1.substr(i, j - i));
  if (it != token_map.end()) {
  output.push_back(it->second);
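The new loop bound means WPM matching never tests candidate substrings longer than the longest token in the vocabulary, so the inner scan is bounded by vocab.max_token_len instead of the word length. A standalone sketch of the same idea (names are hypothetical, not llama.cpp internals):

#include <algorithm>
#include <string>
#include <unordered_map>

// return the token id of the longest vocab entry starting at pos, or -1
static int greedy_longest_match(const std::unordered_map<std::string, int> & token_map,
                                const std::string & word, size_t pos, size_t max_token_len) {
    const size_t end = std::min(word.size(), pos + max_token_len);
    for (size_t j = end; j > pos; --j) { // longest candidate first
        const auto it = token_map.find(word.substr(pos, j - pos));
        if (it != token_map.end()) {
            return it->second;
        }
    }
    return -1; // no vocab entry starts here
}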
@@ -13222,11 +13542,12 @@ struct llm_tokenizer_wpm {
  }
  }

- std::vector<std::string> preprocess(const std::string & text) {
+ // TODO: reduce string copies by using cpts_offs array
+ std::vector<std::string> preprocess(const std::string & text) const {
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
  std::vector<std::string> words(1, "");

- for (const char32_t cpt : cpts_nfd) {
+ for (const uint32_t cpt : cpts_nfd) {
  const auto flags = unicode_cpt_flags(cpt);

  if (flags.is_whitespace) {
@@ -13444,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

  bool is_prev_special = false;

- if (add_special && vocab.special_add_bos != 0) {
+ if (add_special && vocab.tokenizer_add_bos) {
  GGML_ASSERT(vocab.special_bos_id != -1);
  output.push_back(vocab.special_bos_id);
  is_prev_special = true;
@@ -13454,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

- if (vocab.add_space_prefix) {
+ if (vocab.tokenizer_add_space_prefix) {
  if (!output.size() || is_prev_special) { // prefix with space if first token
  raw_text = " " + raw_text;
  }
@@ -13472,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }
  }

- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
  LLAMA_LOG_WARN(
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
  "Are you sure this is what you want?\n", __FUNCTION__);
  }

- if (add_special && vocab.special_add_eos == 1) {
+ if (add_special && vocab.tokenizer_add_eos) {
  GGML_ASSERT(vocab.special_eos_id != -1);
  output.push_back(vocab.special_eos_id);
  }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- if (add_special && vocab.special_add_bos != 0) {
- GGML_ASSERT(vocab.special_bos_id != -1);
- output.push_back(vocab.special_bos_id);
+ llm_tokenizer_bpe tokenizer(vocab);
+
+ if (add_special) {
+ tokenizer.append_bos(output);
  }

  for (const auto & fragment : fragment_buffer) {
@@ -13498,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  #endif
- llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
+ tokenizer.append(fragment.token, output);
  }
  }

- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
- }
-
- if (add_special && vocab.special_add_eos == 1) {
- GGML_ASSERT(vocab.special_add_eos != -1);
- output.push_back(vocab.special_eos_id);
+ if (add_special) {
+ tokenizer.append_eos(output);
+ tokenizer.check_double_bos_eos(output);
  }
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
@@ -13524,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(vocab.special_cls_id);
  }

+ llm_tokenizer_wpm tokenizer(vocab);
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13531,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  #endif
- llm_tokenizer_wpm tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
@@ -13631,7 +13946,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  const uint32_t chr) {

  bool found = false;
- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;

  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

@@ -13640,6 +13955,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  // inclusive range, e.g. [a-z]
  found = found || (pos->value <= chr && chr <= pos[1].value);
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ found = true;
+ pos += 1;
  } else {
  // exact char match, e.g. [a] or "a"
  found = found || pos->value == chr;
@@ -13657,7 +13976,7 @@ static bool llama_grammar_match_partial_char(
  const llama_grammar_element * pos,
  const llama_partial_utf8 partial_utf8) {

- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);

  uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +14006,9 @@ static bool llama_grammar_match_partial_char(
  return is_positive_char;
  }
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ return true;
  } else {
  // exact char match, e.g. [a] or "a"
  if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +14069,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_ANY:
  if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
  // only add the stack if it's not a duplicate of one we already have
  new_stacks.emplace_back(stack);
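Taken together, these three grammar hunks add a new element type, LLAMA_GRETYPE_CHAR_ANY, which the in-code comments describe as matching any single character, i.e. "." in GBNF grammar text. An illustrative sketch (the grammar string is a made-up example, not from the source):

// GBNF grammar matching exactly one arbitrary character between angle brackets
const char * grammar_str = "root ::= \"<\" . \">\"";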
@@ -15220,6 +15543,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (imatrix_data) {
  LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
  qs.has_imatrix = true;
+ // check imatrix for nans or infs
+ for (const auto & kv : *imatrix_data) {
+ for (float f : kv.second) {
+ if (!std::isfinite(f)) {
+ throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+ }
+ }
+ }
  }
  }

@@ -16024,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
  params.flash_attn = false;
  }

+ if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+ LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+ params.flash_attn = false;
+ }
+
  if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  return nullptr;
@@ -16195,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
  if (backend == nullptr) {
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
  llama_free(ctx);
  return nullptr;
  }
@@ -16226,6 +16561,16 @@ struct llama_context * llama_new_context_with_model(
  ctx->backends.push_back(backend);
  }
  #endif
+
+ #ifdef GGML_USE_BLAS
+ ctx->backend_blas = ggml_backend_blas_init();
+ if (ctx->backend_blas == nullptr) {
+ LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+ } else {
+ ctx->backends.push_back(ctx->backend_blas);
+ }
+ #endif
+
  #if defined(GGML_USE_RPC)
  if (model->n_gpu_layers > 0) {
  for (const auto & endpoint : model->rpc_servers) {
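This registers the new dedicated BLAS backend (the ggml-blas.cpp/ggml-blas.h files added in this release) with the scheduler, replacing the thread-capping heuristic removed earlier in the diff. A minimal sketch of the backend used in isolation, assuming a build with GGML_USE_BLAS defined (the set-threads helper is assumed to match its declaration in ggml-blas.h):

ggml_backend_t blas = ggml_backend_blas_init();
if (blas != nullptr) {
    ggml_backend_blas_set_n_threads(blas, 4); // number of threads the backend may use
    // ... hand it to ggml_backend_sched alongside the CPU backend ...
    ggml_backend_free(blas);
}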
@@ -17814,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
  ctx->abort_callback_data = abort_callback_data;
  }

+ void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+ ctx->cparams.embeddings = embeddings;
+ }
+
  void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
  ctx->cparams.causal_attn = causal_attn;
  }
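llama_set_embeddings is new in this release; it lets an existing context switch between generation and embedding extraction at runtime instead of fixing the mode at creation. A minimal usage sketch:

llama_set_embeddings(ctx, true);   // subsequent llama_decode() calls produce embeddings
// ... llama_decode(ctx, batch); read results via llama_get_embeddings_seq() ...
llama_set_embeddings(ctx, false);  // back to logits for sampling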
@@ -18057,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
  }

  int32_t llama_add_bos_token(const struct llama_model * model) {
- return model->vocab.special_add_bos;
+ return model->vocab.tokenizer_add_bos;
  }

  int32_t llama_add_eos_token(const struct llama_model * model) {
- return model->vocab.special_add_eos;
+ return model->vocab.tokenizer_add_eos;
  }

  llama_token llama_token_prefix(const struct llama_model * model) {