@fugood/llama.node 0.3.2 → 0.3.3

Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/common/sampling.cpp

@@ -98,8 +98,8 @@ struct ring_buffer {
  std::vector<T> data;
  };
 
- struct gpt_sampler {
- gpt_sampler_params params;
+ struct common_sampler {
+ common_sampler_params params;
 
  struct llama_sampler * grmr;
  struct llama_sampler * chain;
@@ -125,26 +125,28 @@ struct gpt_sampler {
  }
  };
 
- std::string gpt_sampler_params::print() const {
+ std::string common_sampler_params::print() const {
  char result[1024];
 
  snprintf(result, sizeof(result),
  "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
- "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+ "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+ "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
  "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
  penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
- top_k, tfs_z, top_p, min_p, typ_p, temp,
+ dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+ top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
  mirostat, mirostat_eta, mirostat_tau);
 
  return std::string(result);
  }
 
- struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
+ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
  llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
  lparams.no_perf = params.no_perf;
 
- auto * result = new gpt_sampler {
+ auto * result = new common_sampler {
  /* .params = */ params,
  /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
  /* .chain = */ llama_sampler_chain_init(lparams),
@@ -171,60 +173,60 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
  params.penalize_nl,
  params.ignore_eos));
 
- if (params.temp > 0.0f) {
- if (params.mirostat == 0) {
- for (const auto & cnstr : params.samplers) {
- switch (cnstr) {
- case GPT_SAMPLER_TYPE_TOP_K:
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+ if (params.mirostat == 0) {
+ for (const auto & cnstr : params.samplers) {
+ switch (cnstr) {
+ case COMMON_SAMPLER_TYPE_DRY:
+ {
+ std::vector<const char*> c_breakers;
+ c_breakers.reserve(params.dry_sequence_breakers.size());
+ for (const auto& str : params.dry_sequence_breakers) {
+ c_breakers.push_back(str.c_str());
+ }
+
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+ }
  break;
- case GPT_SAMPLER_TYPE_TOP_P:
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
- break;
- case GPT_SAMPLER_TYPE_MIN_P:
- llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
- break;
- case GPT_SAMPLER_TYPE_TFS_Z:
- llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
- break;
- case GPT_SAMPLER_TYPE_TYPICAL_P:
- llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
- break;
- case GPT_SAMPLER_TYPE_TEMPERATURE:
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
- break;
- default:
- GGML_ASSERT(false && "unknown sampler type");
- }
+ case COMMON_SAMPLER_TYPE_TOP_K:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+ break;
+ case COMMON_SAMPLER_TYPE_TOP_P:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+ break;
+ case COMMON_SAMPLER_TYPE_MIN_P:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+ break;
+ case COMMON_SAMPLER_TYPE_XTC:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+ break;
+ case COMMON_SAMPLER_TYPE_TYPICAL_P:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+ break;
+ case COMMON_SAMPLER_TYPE_TEMPERATURE:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+ break;
+ case COMMON_SAMPLER_TYPE_INFILL:
+ llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
+ break;
+ default:
+ GGML_ASSERT(false && "unknown sampler type");
  }
- llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
- llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
- } else if (params.mirostat == 1) {
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
- } else if (params.mirostat == 2) {
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
- } else {
- GGML_ASSERT(false && "unknown mirostat version");
  }
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+ } else if (params.mirostat == 1) {
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+ } else if (params.mirostat == 2) {
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
  } else {
- if (params.n_probs > 0) {
- // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
- // ref: https://github.com/ggerganov/llama.cpp/pull/9605
- //
- // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
- // it is much faster, since we avoid sorting all tokens and should give a good approximation
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
- llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
- }
- llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+ GGML_ASSERT(false && "unknown mirostat version");
  }
 
  return result;
  }
 
- void gpt_sampler_free(struct gpt_sampler * gsmpl) {
+ void common_sampler_free(struct common_sampler * gsmpl) {
  if (gsmpl) {
  llama_sampler_free(gsmpl->grmr);
 
@@ -234,7 +236,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
  }
  }
 
- void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
  if (accept_grammar) {
  llama_sampler_accept(gsmpl->grmr, token);
  }
@@ -244,14 +246,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
  gsmpl->prev.push_back(token);
  }
 
- void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
+ void common_sampler_reset(struct common_sampler * gsmpl) {
  llama_sampler_reset(gsmpl->grmr);
 
  llama_sampler_reset(gsmpl->chain);
  }
 
- struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
- return new gpt_sampler {
+ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+ return new common_sampler {
  /* .params = */ gsmpl->params,
  /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
  /* .chain = */ llama_sampler_clone(gsmpl->chain),
@@ -261,7 +263,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
  };
  }
 
- void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
  // TODO: measure grammar performance
 
  if (gsmpl) {
@@ -272,7 +274,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  }
  }
 
- llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
  gsmpl->set_logits(ctx, idx);
 
  auto & grmr = gsmpl->grmr;
@@ -318,21 +320,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
  return cur_p.data[cur_p.selected].id;
  }
 
- uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
  return llama_sampler_get_seed(gsmpl->chain);
  }
 
  // helpers
 
- llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
  return &gsmpl->cur_p;
  }
 
- llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
+ llama_token common_sampler_last(const struct common_sampler * gsmpl) {
  return gsmpl->prev.rat(0);
  }
 
- std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
+ std::string common_sampler_print(const struct common_sampler * gsmpl) {
  std::string result = "logits ";
 
  for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@@ -343,7 +345,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
  return result;
  }
 
- std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
  n = std::min(n, (int) gsmpl->prev.size());
 
  if (n <= 0) {
@@ -358,63 +360,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
 
  GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
 
- result += llama_token_to_piece(ctx_main, id);
+ result += common_token_to_piece(ctx_main, id);
  }
 
  return result;
  }
 
- char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
  switch (cnstr) {
- case GPT_SAMPLER_TYPE_TOP_K: return 'k';
- case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
- case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
- case GPT_SAMPLER_TYPE_TOP_P: return 'p';
- case GPT_SAMPLER_TYPE_MIN_P: return 'm';
- case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
+ case COMMON_SAMPLER_TYPE_DRY: return 'd';
+ case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
+ case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
+ case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+ case COMMON_SAMPLER_TYPE_XTC: return 'x';
+ case COMMON_SAMPLER_TYPE_INFILL: return 'i';
  default : return '?';
  }
  }
 
- std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
  switch (cnstr) {
- case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
- case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
- case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
- case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
- case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
- case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+ case COMMON_SAMPLER_TYPE_DRY: return "dry";
+ case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
+ case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
+ case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+ case COMMON_SAMPLER_TYPE_XTC: return "xtc";
+ case COMMON_SAMPLER_TYPE_INFILL: return "infill";
  default : return "";
  }
  }
 
- std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
- std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
- { "top_k", GPT_SAMPLER_TYPE_TOP_K },
- { "top_p", GPT_SAMPLER_TYPE_TOP_P },
- { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
- { "min_p", GPT_SAMPLER_TYPE_MIN_P },
- { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
- { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+ std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+ { "dry", COMMON_SAMPLER_TYPE_DRY },
+ { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
+ { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
+ { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
+ { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+ { "xtc", COMMON_SAMPLER_TYPE_XTC },
+ { "infill", COMMON_SAMPLER_TYPE_INFILL },
  };
 
  // since samplers names are written multiple ways
  // make it ready for both system names and input names
- std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
- { "top-k", GPT_SAMPLER_TYPE_TOP_K },
- { "top-p", GPT_SAMPLER_TYPE_TOP_P },
- { "nucleus", GPT_SAMPLER_TYPE_TOP_P },
- { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
- { "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
- { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
- { "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
- { "min-p", GPT_SAMPLER_TYPE_MIN_P },
- { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
- { "tfs", GPT_SAMPLER_TYPE_TFS_Z },
- { "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
+ std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+ { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
+ { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
+ { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
+ { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
+ { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
  };
 
- std::vector<gpt_sampler_type> samplers;
+ std::vector<common_sampler_type> samplers;
  samplers.reserve(names.size());
 
  for (const auto & name : names) {
@@ -434,17 +440,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
  return samplers;
  }
 
- std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
- std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
- { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
+ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+ std::unordered_map<char, common_sampler_type> sampler_name_map = {
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
  };
 
- std::vector<gpt_sampler_type> samplers;
+ std::vector<common_sampler_type> samplers;
  samplers.reserve(chars.size());
 
  for (const auto & c : chars) {
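The chain built by `common_sampler_init` is driven entirely by `common_sampler_params`, so the gpt_* → common_* rename plus the new DRY/XTC fields is the kind of change call sites such as `package/src/LlamaCompletionWorker.cpp` (+8 -8 above) have to track. A minimal caller-side sketch, assuming the header path, struct defaults, and numeric values, none of which are shown in this diff:

```cpp
// Hypothetical caller-side sketch (not part of this diff). Function, enum, and field
// names are taken from the hunks above; the include path and all values are assumptions.
#include "sampling.h"   // assumed include path

static struct common_sampler * make_sampler(const struct llama_model * model) {
    struct common_sampler_params sparams;   // default-constructed; defaults assumed sane

    // chain order via the one-letter codes defined above:
    // d = dry, k = top_k, y = typ_p, p = top_p, m = min_p, x = xtc, t = temperature
    sparams.samplers = common_sampler_types_from_chars("dkypmxt");

    // DRY and XTC knobs introduced in this version (values are illustrative only)
    sparams.dry_multiplier        = 0.8f;
    sparams.dry_base              = 1.75f;
    sparams.dry_allowed_length    = 2;
    sparams.dry_sequence_breakers = { "\n", ":", "\"", "*" };
    sparams.xtc_probability       = 0.5f;
    sparams.xtc_threshold         = 0.1f;

    sparams.mirostat = 0;   // 0 uses the chain above; 1 or 2 selects the mirostat branches

    return common_sampler_init(model, sparams);   // release later with common_sampler_free()
}
```

Note that tail-free sampling (`tfs_z`) no longer appears anywhere in the chain, so callers that previously requested `tfs`/`tfs_z` need to drop it from their sampler list.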
package/src/llama.cpp/common/sampling.h

@@ -7,7 +7,7 @@
  #include <string>
  #include <vector>
 
- // gpt_sampler extends llama_sampler with additional functionality:
+ // common_sampler extends llama_sampler with additional functionality:
  //
  // - grammar support
  // - custom sampler logic based on the parameters
@@ -23,30 +23,30 @@
  // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
  // grammar constraints are applied to the full vocabulary and the token is resampled.
  //
- // The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
+ // The common_sampler also maintains a container with the last accepted tokens. In the future, this can
  // be moved into the core llama library.
  //
- // For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
+ // For convenience, the common_sampler also maintains a container with the current candidate tokens.
  // This can be used to access the probabilities of the rest of the non-sampled tokens.
  //
  // TODO: measure grammar performance
  //
 
- struct gpt_sampler;
+ struct common_sampler;
 
  // llama_sampler API overloads
 
- struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
+ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
 
- void gpt_sampler_free(struct gpt_sampler * gsmpl);
+ void common_sampler_free(struct common_sampler * gsmpl);
 
  // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
- void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
- void gpt_sampler_reset (struct gpt_sampler * gsmpl);
- struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+ void common_sampler_reset (struct common_sampler * gsmpl);
+ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
  // arguments can be nullptr to skip printing
- void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
  // extended sampling implementation:
  //
@@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  // if grammar_first is true, the grammar is applied before the samplers (slower)
  // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
  //
- llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
- uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
  // helpers
 
  // access the internal list of current candidate tokens
- llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
 
  // get the last accepted token
- llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
+ llama_token common_sampler_last(const struct common_sampler * gsmpl);
 
  // print the sampler chain into a string
- std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
+ std::string common_sampler_print(const struct common_sampler * gsmpl);
 
  // get a string representation of the last accepted tokens
- std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
 
- char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
- std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr);
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
- std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
- std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+ std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
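Read together, the renamed declarations describe the call sequence a consumer follows: initialize, sample, accept, then print/free. A sketch of that loop, assuming the surrounding llama context and decode plumbing (not shown in this diff) is already in place:

```cpp
// Generation-loop sketch against the header above. Only the common_* calls come from
// this diff; llama_decode/batch handling is elided and left as a comment.
#include <cstdio>
#include "common.h"     // assumed: declares common_token_to_piece(), used in the .cpp hunk above
#include "sampling.h"   // assumed include path for the declarations above

void generate_n(struct common_sampler * gsmpl, struct llama_context * ctx, int n_predict) {
    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last evaluated position (idx == -1),
        // applying the grammar after the chain (grammar_first defaults to false)
        const llama_token id = common_sampler_sample(gsmpl, ctx, /* idx */ -1);

        // record the token in both the sampler chain and the grammar state
        common_sampler_accept(gsmpl, id, /* accept_grammar */ true);

        printf("%s", common_token_to_piece(ctx, id).c_str());

        // ... feed `id` back through llama_decode(...) here before the next iteration ...
    }

    common_perf_print(ctx, gsmpl);   // either argument may be nullptr to skip printing
}
```

Leaving `grammar_first` at `false` follows the header's note that checking the grammar only against the sampled token (and resampling on a mismatch) is the faster path.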
package/src/llama.cpp/docs/build.md

@@ -186,18 +186,16 @@ The following compilation options are also available to tweak performance:
 
  | Option | Legal values | Default | Description |
  |-------------------------------|------------------------|---------|-------------|
- | GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
  | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
  | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
  | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
  | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
 
  ### MUSA
 
+ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
  - Using `make`:
  ```bash
  make GGML_MUSA=1
@@ -209,6 +207,12 @@ The following compilation options are also available to tweak performance:
  cmake --build build --config Release
  ```
 
+ The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+ Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
  ### hipBLAS
 
  This provides BLAS acceleration on HIP-supported AMD GPUs.
@@ -222,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
- cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
  && cmake --build build --config Release -- -j 16
  ```
  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -239,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
- cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
  && cmake --build build -- -j 16
  ```
 
@@ -251,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
- cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+ cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -260,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
- The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
- | Option | Legal values | Default | Description |
- |------------------------|------------------------|---------|-------------|
- | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
  ### Vulkan
 
@@ -274,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,
 
  #### w64devkit
 
- Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+ Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
 
- Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+ Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 
  Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
  ```sh
@@ -294,6 +291,29 @@ EOF
  ```
  Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 
+ #### Git Bash MINGW64
+
+ Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+ Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+ Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+ Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+ Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+ ```
+ cmake -B build -DGGML_VULKAN=ON
+ cmake --build build --config Release
+ ```
+
+ Now you can load the model in conversation mode using `Vulkan`
+
+ ```
+ build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+ ```
+
  #### MSYS2
  Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
  ```sh
@@ -367,7 +387,7 @@ cmake --build build --config release
 
  You can test with:
 
- `./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+ `./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
 
  If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
  ```bash
package/src/llama.cpp/examples/CMakeLists.txt

@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
  if (EMSCRIPTEN)
  else()
  add_subdirectory(cvector-generator)
- add_subdirectory(baby-llama)
  add_subdirectory(batched-bench)
  add_subdirectory(batched)
  add_subdirectory(convert-llama2c-to-ggml)
@@ -49,6 +48,7 @@ else()
  endif()
  add_subdirectory(save-load-state)
  add_subdirectory(simple)
+ add_subdirectory(simple-chat)
  add_subdirectory(speculative)
  add_subdirectory(tokenize)
  endif()