cui-llama.rn 1.0.7 → 1.0.9

@@ -174,7 +174,7 @@ Java_com_rnllama_LlamaContext_initContext(
 
  const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
  if (lora_chars != nullptr && lora_chars[0] != '\0') {
- defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
+ defaultParams.lora_adapters.push_back({lora_chars, lora_scaled});
  defaultParams.use_mmap = false;
  }
 
package/cpp/common.cpp CHANGED
@@ -690,14 +690,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  }
  if (arg == "--lora") {
  CHECK_ARG
- params.lora_adapter.emplace_back(argv[i], 1.0f);
+ params.lora_adapters.push_back({
+ std::string(argv[i]),
+ 1.0,
+ });
  return true;
  }
  if (arg == "--lora-scaled") {
  CHECK_ARG
- const char* lora_adapter = argv[i];
+ std::string lora_adapter = argv[i];
  CHECK_ARG
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+ params.lora_adapters.push_back({
+ lora_adapter,
+ std::stof(argv[i]),
+ });
+ return true;
+ }
+ if (arg == "--lora-init-without-apply") {
+ params.lora_init_without_apply = true;
  return true;
  }
  if (arg == "--control-vector") {
@@ -1660,6 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
  "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
  options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
  "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+ options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
  #ifndef LOG_DISABLE_LOGS
  options.push_back({ "logging" });
@@ -1772,6 +1783,17 @@ std::string string_get_sortable_timestamp() {
  return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
  }
 
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return; // Avoid infinite loop if 'search' is an empty string
+ }
+ size_t pos = 0;
+ while ((pos = s.find(search, pos)) != std::string::npos) {
+ s.replace(pos, search.length(), replace);
+ pos += replace.length();
+ }
+ }
+
  void string_process_escapes(std::string & input) {
  std::size_t input_len = input.length();
  std::size_t output_idx = 0;
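
Note (not part of the diff): a minimal usage sketch of the new string_replace_all helper, with illustrative values:

    std::string tmpl = "Hello, {name}!";
    string_replace_all(tmpl, "{name}", "world");
    // tmpl is now "Hello, world!"; an empty search string is a no-op
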
@@ -2045,8 +2067,8 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
-
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+ llama_init_result iparams;
  auto mparams = llama_model_params_from_gpt_params(params);
 
  llama_model * model = nullptr;
@@ -2061,7 +2083,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
  if (model == NULL) {
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
 
  auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (lctx == NULL) {
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
 
  if (!params.control_vectors.empty()) {
@@ -2081,7 +2103,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
 
  int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2115,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (err) {
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
  }
 
- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
- float lora_scale = std::get<1>(params.lora_adapter[i]);
- auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
- if (adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ // load and optionally apply lora adapters
+ for (auto & la : params.lora_adapters) {
+ llama_lora_adapter_container loaded_la;
+ loaded_la.path = la.path;
+ loaded_la.scale = la.scale;
+ loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+ if (loaded_la.adapter == nullptr) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
- llama_lora_adapter_set(lctx, adapter, lora_scale);
+ iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+ }
+ if (!params.lora_init_without_apply) {
+ llama_lora_adapters_apply(lctx, iparams.lora_adapters);
  }
 
  if (params.ignore_eos) {
@@ -2135,22 +2162,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  tmp.clear();
  tmp.push_back(decoder_start_token_id);
  }
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ if (llama_model_has_decoder(model)) {
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
  llama_reset_timings(lctx);
  }
 
- return std::make_tuple(model, lctx);
+ iparams.model = model;
+ iparams.context = lctx;
+ return iparams;
+ }
+
+ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ llama_lora_adapter_clear(ctx);
+ for (auto & la : lora_adapters) {
+ if (la.scale != 0.0f) {
+ llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ }
+ }
  }
 
  struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
  auto mparams = llama_model_default_params();
 
- mparams.vocab_only = params.vocab_only;
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
+ mparams.vocab_only = params.vocab_only;
  mparams.rpc_servers = params.rpc_servers.c_str();
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
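
Note (not part of the diff): a hypothetical caller sketch of the migration from the old tuple return value to the new llama_init_result struct; everything except the API names changed above is illustrative.

    gpt_params params;
    // before: std::tie(model, lctx) = llama_init_from_gpt_params(params);
    llama_init_result init = llama_init_from_gpt_params(params);
    if (init.model == nullptr || init.context == nullptr) {
        return 1; // load failed; the struct members stay null on every error path
    }
    // ... run inference with init.model / init.context ...
    llama_free(init.context);
    llama_free_model(init.model);
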
@@ -3167,19 +3207,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
  }
 
  fprintf(stream, "lora:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) != 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale == 1.0f) {
+ fprintf(stream, " - %s\n", la.path.c_str());
  }
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
  }
  fprintf(stream, "lora_scaled:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) == 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale != 1.0f) {
+ fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
  }
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
  }
+ fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
  fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
  fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
package/cpp/common.h CHANGED
@@ -33,6 +33,15 @@
 
  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+ struct llama_lora_adapter_info {
+ std::string path;
+ float scale;
+ };
+
+ struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct llama_lora_adapter * adapter;
+ };
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
  extern char const * LLAMA_COMMIT;
@@ -138,8 +147,8 @@ struct gpt_params {
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;
 
- // TODO: avoid tuple, use struct
- std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+ std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
  std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -289,6 +298,8 @@ std::vector<std::string> string_split(std::string input, char separator);
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();
 
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
  template<class T>
  static std::vector<T> string_split(const std::string & str, char delim) {
  std::vector<T> values;
@@ -320,8 +331,13 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //
 
- // TODO: avoid tuplue, use struct
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+ struct llama_init_result {
+ struct llama_model * model = nullptr;
+ struct llama_context * context = nullptr;
+ std::vector<llama_lora_adapter_container> lora_adapters;
+ };
+
+ struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
  struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -329,6 +345,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
  struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+ // clear LoRA adapters from context, then apply new list of adapters
+ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
  // Batch utils
 
  void llama_batch_clear(struct llama_batch & batch);
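
Note (not part of the diff): a hypothetical sketch of the deferred LoRA application these additions enable; the 0.75f scale is illustrative.

    params.lora_init_without_apply = true;   // load adapters, but leave the context clean
    llama_init_result init = llama_init_from_gpt_params(params);

    // later (e.g. per request): rescale and hot-swap; a scale of 0.0f leaves an
    // adapter loaded but not applied, per llama_lora_adapters_apply above
    for (auto & la : init.lora_adapters) {
        la.scale = 0.75f;
    }
    llama_lora_adapters_apply(init.context, init.lora_adapters);
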
@@ -16,6 +16,8 @@
 
  #if defined(__GNUC__)
  #pragma GCC diagnostic ignored "-Woverlength-strings"
+ #elif defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
 
  #define UNUSED LM_GGML_UNUSED
@@ -384,8 +386,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -496,8 +498,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -614,7 +616,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -680,12 +682,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  return;
  }
  else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
  "performance");
  }
  else if (lm_ggml_cpu_has_neon()) {
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+ LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
  "quantization format for optimal performance");
  }
@@ -745,8 +747,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -1266,8 +1268,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -1728,7 +1730,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);
 
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -2139,12 +2141,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  return;
  }
  else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
  "performance");
  }
  else if (lm_ggml_cpu_has_neon()) {
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+ LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
  "quantization format for optimal performance");
  }
@@ -351,15 +351,10 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
  }
 
  // an async copy would normally happen after all the queued operations on both backends are completed
- // sync src, set_async dst
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
- lm_ggml_backend_synchronize(backend_src);
- lm_ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, lm_ggml_nbytes(src));
- } else {
- lm_ggml_backend_synchronize(backend_src);
- lm_ggml_backend_tensor_copy(src, dst);
- lm_ggml_backend_synchronize(backend_dst);
- }
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+ lm_ggml_backend_synchronize(backend_src);
+ lm_ggml_backend_synchronize(backend_dst);
+ lm_ggml_backend_tensor_copy(src, dst);
  }
 
  // events
@@ -1782,7 +1777,17 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
  } else {
  lm_ggml_backend_synchronize(split_backend);
  }
- lm_ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ lm_ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ lm_ggml_backend_synchronize(split_backend);
+ }
+ lm_ggml_backend_tensor_copy(input, input_cpy);
+ }
  }
  }
 
package/cpp/ggml-impl.h CHANGED
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
  /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
  static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
  h.bits = (u.i >> 16) | 64; /* force to quiet */
  return h;
  }
- if (!(u.i & 0x7f800000)) { /* subnormal */
- h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
- return h;
- }
  h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
  return h;
  }
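
Note (not part of the diff): a small standalone sketch of the round-to-nearest-even step kept above, for normal, non-NaN inputs; the two bit patterns are illustrative half-way cases.

    #include <cstdint>
    #include <cstring>

    static uint16_t fp32_to_bf16_rne(float s) {
        uint32_t i;
        std::memcpy(&i, &s, sizeof(i));                       // same bit reinterpretation as the union above
        return (uint16_t) ((i + (0x7fff + ((i >> 16) & 1))) >> 16);
    }
    // 0x3F808000 (half-way, even upper half 0x3F80) -> 0x3F80 (rounds down)
    // 0x3F818000 (half-way, odd  upper half 0x3F81) -> 0x3F82 (rounds up)
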
@@ -146,6 +143,7 @@ extern "C" {
 
  #if defined(__ARM_FEATURE_SVE)
  #include <arm_sve.h>
+ #include <sys/prctl.h>
  #endif
 
  // 16-bit float
package/cpp/ggml-metal.h CHANGED
@@ -50,6 +50,8 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_f
 
  LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb);
 
+ LM_GGML_API void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data);
+
  LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
 
  // helper to check if the device supports a specific family
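
Note (not part of the diff): a hypothetical sketch of wiring up the new abort callback; g_stop and setup_metal_backend are illustrative names. Returning true from the callback makes the graph compute stop committing further command buffers and return LM_GGML_STATUS_ABORTED, per the ggml-metal.m changes below.

    #include <atomic>
    #include "ggml-metal.h"

    static std::atomic<bool> g_stop{false};

    static bool should_abort(void * /*user_data*/) {
        return g_stop.load();   // set g_stop from another thread to cancel
    }

    static lm_ggml_backend_t setup_metal_backend() {
        lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
        lm_ggml_backend_metal_set_abort_callback(backend, should_abort, nullptr);
        return backend;
    }
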
package/cpp/ggml-metal.m CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
  LM_GGML_METAL_KERNEL_TYPE_COUNT
  };
 
- struct lm_ggml_metal_context {
+ struct lm_ggml_backend_metal_context {
  int n_cb;
 
  id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
  bool support_simdgroup_mm;
 
  bool should_capture_next_compute;
+
+ // abort lm_ggml_metal_graph_compute if callback returns true
+ lm_ggml_abort_callback abort_callback;
+ void * abort_callback_data;
  };
 
  // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
  return data;
  }
 
- static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
  #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
  // Configure context
- struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context));
+ struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
  ctx->device = device;
  ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
  ctx->queue = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
  return ctx;
  }
 
- static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+ static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
  LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
  for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
  return nil;
  }
 
- static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
  for (size_t i = 0, n = 3; i < n; ++i) {
  if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
  return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
  }
 
  static enum lm_ggml_status lm_ggml_metal_graph_compute(
- struct lm_ggml_metal_context * ctx,
+ struct lm_ggml_backend_metal_context * ctx,
  struct lm_ggml_cgraph * gf) {
 
  @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
  command_buffer_builder[cb_idx] = command_buffer;
 
- // enqueue the command buffers in order to specify their execution order
- [command_buffer enqueue];
+ // always enqueue the first two command buffers
+ // enqueue all of the command buffers if we don't need to abort
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer enqueue];
+ }
  }
 
  const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  LM_GGML_ASSERT(ne00 % 4 == 0);
  LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
- //float eps;
- //memcpy(&eps, dst->op_params, sizeof(float));
-
- const float eps = 1e-6f; // TODO: temporarily hardcoded
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
 
  const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
  [encoder endEncoding];
 
- [command_buffer commit];
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer commit];
+ }
  });
 
  // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
  return LM_GGML_STATUS_FAILED;
  }
+
+ id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+ if (!next_buffer) {
+ continue;
+ }
+
+ bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+ if (next_queued) {
+ continue;
+ }
+
+ if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+ LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+ return LM_GGML_STATUS_ABORTED;
+ }
+
+ [next_buffer commit];
  }
 
  if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
  }
 
  LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
  lm_ggml_metal_free(ctx);
  free(backend);
  }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
  }
 
  LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
- struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
  return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
  }
 
  LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
- struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
  return lm_ggml_metal_supports_op(metal_ctx, op);
  }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
  }
 
  lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
- struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+ struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
  if (ctx == NULL) {
+ LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
  return NULL;
  }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
  void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
  ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
  }
 
+ void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+ LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+ ctx->abort_callback = abort_callback;
+ ctx->abort_callback_data = user_data;
+ }
+
  bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
  return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
  }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
  void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
  ctx->should_capture_next_compute = true;
  }
 
package/cpp/ggml-quants.c CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;
 
  #if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
  const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;
 
  #if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  svfloat32_t sumv0 = svdup_n_f32(0.0f);
  svfloat32_t sumv1 = svdup_n_f32(0.0f);