cui-llama.rn 1.0.6 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,6 +10,7 @@ The following features have been added for Android:
  - Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
  - `vocab_only` mode: utilize the llama.cpp tokenizer
  - tokenizeSync: non-blocking, synchronous tokenizer function
+ - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)

  Original repo README.md below.

@@ -174,7 +174,7 @@ Java_com_rnllama_LlamaContext_initContext(

  const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
  if (lora_chars != nullptr && lora_chars[0] != '\0') {
- defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
+ defaultParams.lora_adapters.push_back({lora_chars, lora_scaled});
  defaultParams.use_mmap = false;
  }

@@ -211,7 +211,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  for (int i = 0; i < count; i++) {
  char key[256];
  llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
- char val[256];
+ char val[2048];
  llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

  putString(env, meta, key, val);
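
The `val` buffer grows from 256 to 2048 bytes because GGUF metadata values, such as chat templates, can easily exceed 256 characters and would otherwise be truncated. For reference, a minimal sketch of the enumeration pattern these JNI calls wrap, written against the llama.cpp C API; the helper name and the map-based collection are illustrative, not part of the package:

```cpp
#include <map>
#include <string>
#include "llama.h"

// Illustrative helper: collect all GGUF metadata key/value pairs from a loaded model.
// The 2048-byte value buffer mirrors the change above; long values (e.g. a chat
// template) would be silently truncated by the previous 256-byte buffer.
static std::map<std::string, std::string> collect_metadata(const llama_model * model) {
    std::map<std::string, std::string> meta;
    const int32_t count = llama_model_meta_count(model);
    for (int32_t i = 0; i < count; i++) {
        char key[256];
        char val[2048];
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        meta[key] = val;
    }
    return meta;
}
```
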
package/cpp/common.cpp CHANGED
@@ -690,14 +690,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  }
  if (arg == "--lora") {
  CHECK_ARG
- params.lora_adapter.emplace_back(argv[i], 1.0f);
+ params.lora_adapters.push_back({
+ std::string(argv[i]),
+ 1.0,
+ });
  return true;
  }
  if (arg == "--lora-scaled") {
  CHECK_ARG
- const char* lora_adapter = argv[i];
+ std::string lora_adapter = argv[i];
  CHECK_ARG
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+ params.lora_adapters.push_back({
+ lora_adapter,
+ std::stof(argv[i]),
+ });
+ return true;
+ }
+ if (arg == "--lora-init-without-apply") {
+ params.lora_init_without_apply = true;
  return true;
  }
  if (arg == "--control-vector") {
@@ -1640,7 +1650,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
  options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
  options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
  options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
- options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
  options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
  options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
  options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1660,6 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
  "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
  options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
  "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+ options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});

  #ifndef LOG_DISABLE_LOGS
  options.push_back({ "logging" });
@@ -1772,6 +1783,17 @@ std::string string_get_sortable_timestamp() {
  return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
  }

+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return; // Avoid infinite loop if 'search' is an empty string
+ }
+ size_t pos = 0;
+ while ((pos = s.find(search, pos)) != std::string::npos) {
+ s.replace(pos, search.length(), replace);
+ pos += replace.length();
+ }
+ }
+
  void string_process_escapes(std::string & input) {
  std::size_t input_len = input.length();
  std::size_t output_idx = 0;
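
The new helper does in-place, single-pass replacement: the scan position is advanced past each inserted `replace`, so a replacement that itself contains the search string cannot recurse. A quick usage sketch:

```cpp
std::string tmpl = "Hi {name}, you have {n} new messages.";
string_replace_all(tmpl, "{name}", "Ada");
string_replace_all(tmpl, "{n}", "3");
// tmpl == "Hi Ada, you have 3 new messages."

std::string s = "aaa";
string_replace_all(s, "a", "aa");  // terminates with s == "aaaaaa"; no infinite loop
```
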
@@ -2045,8 +2067,8 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
-
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+ llama_init_result iparams;
  auto mparams = llama_model_params_from_gpt_params(params);

  llama_model * model = nullptr;
@@ -2061,7 +2083,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

  if (model == NULL) {
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }

  auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (lctx == NULL) {
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }

  if (!params.control_vectors.empty()) {
@@ -2081,7 +2103,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }

  int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2115,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  if (err) {
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
  }

- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
- float lora_scale = std::get<1>(params.lora_adapter[i]);
- auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
- if (adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ // load and optionally apply lora adapters
+ for (auto & la : params.lora_adapters) {
+ llama_lora_adapter_container loaded_la;
+ loaded_la.path = la.path;
+ loaded_la.scale = la.scale;
+ loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+ if (loaded_la.adapter == nullptr) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
  llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
  }
- llama_lora_adapter_set(lctx, adapter, lora_scale);
+ iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+ }
+ if (!params.lora_init_without_apply) {
+ llama_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

  if (params.ignore_eos) {
@@ -2135,22 +2162,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  tmp.clear();
  tmp.push_back(decoder_start_token_id);
  }
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ if (llama_model_has_decoder(model)) {
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
  llama_reset_timings(lctx);
  }

- return std::make_tuple(model, lctx);
+ iparams.model = model;
+ iparams.context = lctx;
+ return iparams;
+ }
+
+ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ llama_lora_adapter_clear(ctx);
+ for (auto & la : lora_adapters) {
+ if (la.scale != 0.0f) {
+ llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ }
+ }
  }

  struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
  auto mparams = llama_model_default_params();

- mparams.vocab_only = params.vocab_only;
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
+ mparams.vocab_only = params.vocab_only;
  mparams.rpc_servers = params.rpc_servers.c_str();
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
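
With the tuple return gone, callers receive a `llama_init_result` carrying the model, the context, and the list of loaded LoRA adapters; every error path above returns the struct with both pointers still null. A hedged migration sketch (parameter setup and generation omitted):

```cpp
#include "common.h"

// Sketch: migrating a caller from the old tuple API to llama_init_result.
static bool init_example(gpt_params & params) {
    // Before: std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_init_result init = llama_init_from_gpt_params(params);
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        return false;  // initialization failed
    }
    // init.lora_adapters now holds the loaded adapters (applied unless
    // params.lora_init_without_apply was set).
    llama_free(ctx);
    llama_free_model(model);
    return true;
}
```
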
@@ -3167,19 +3207,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
  }

  fprintf(stream, "lora:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) != 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale == 1.0f) {
+ fprintf(stream, " - %s\n", la.path.c_str());
  }
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
  }
  fprintf(stream, "lora_scaled:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) == 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale != 1.0f) {
+ fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
  }
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
  }
+ fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
  fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
  fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
package/cpp/common.h CHANGED
@@ -33,6 +33,15 @@

  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+ struct llama_lora_adapter_info {
+ std::string path;
+ float scale;
+ };
+
+ struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct llama_lora_adapter * adapter;
+ };
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
  extern char const * LLAMA_COMMIT;
@@ -138,8 +147,8 @@ struct gpt_params {
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;

- // TODO: avoid tuple, use struct
- std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+ std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

  std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -289,6 +298,8 @@ std::vector<std::string> string_split(std::string input, char separator);
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();

+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
  template<class T>
  static std::vector<T> string_split(const std::string & str, char delim) {
  std::vector<T> values;
@@ -320,8 +331,13 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //

- // TODO: avoid tuplue, use struct
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+ struct llama_init_result {
+ struct llama_model * model = nullptr;
+ struct llama_context * context = nullptr;
+ std::vector<llama_lora_adapter_container> lora_adapters;
+ };
+
+ struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

  struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -329,6 +345,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
  struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

+ // clear LoRA adapters from context, then apply new list of adapters
+ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
  // Batch utils

  void llama_batch_clear(struct llama_batch & batch);
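
Combined with `lora_init_without_apply`, this declaration lets adapters be loaded once and then enabled, disabled, or re-weighted on a live context without reloading the model. A hedged sketch, assuming `ctx` and the `init.lora_adapters` vector from the earlier initialization sketch; the paths are placeholders:

```cpp
// Re-weight the already-loaded adapters on the live context.
for (auto & la : init.lora_adapters) {
    la.scale = (la.path == "style.gguf") ? 0.75f : 0.0f;  // 0.0f = keep loaded, do not apply
}
llama_lora_adapters_apply(ctx, init.lora_adapters);       // clears, then re-applies non-zero entries
```
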
@@ -16,6 +16,8 @@

  #if defined(__GNUC__)
  #pragma GCC diagnostic ignored "-Woverlength-strings"
+ #elif defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

  #define UNUSED LM_GGML_UNUSED
@@ -384,8 +386,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -496,8 +498,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -614,7 +616,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -680,12 +682,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  return;
  }
  else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
  "performance");
  }
  else if (lm_ggml_cpu_has_neon()) {
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+ LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
  "quantization format for optimal performance");
  }
@@ -745,8 +747,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -1266,8 +1268,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (lm_ggml_sve_cnt_b == QK8_0) {
+ LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
  }
  #endif
@@ -1728,7 +1730,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(blocklen);

  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -2139,12 +2141,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  return;
  }
  else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
  "performance");
  }
  else if (lm_ggml_cpu_has_neon()) {
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
+ LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
  "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
  "quantization format for optimal performance");
  }
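
Each of these hunks replaces the per-call `svcntw() == 8` test (eight 32-bit words, i.e. a 256-bit SVE register) with a comparison against a cached byte count: `QK8_0` is 32, so `lm_ggml_sve_cnt_b == QK8_0` expresses the same 256-bit condition without re-querying the vector length in every GEMV/GEMM call. A speculative sketch of how such a cached value might be populated once; the initializer is hypothetical, and whether upstream uses `svcntb()` or `prctl(PR_SVE_GET_VL)` (hinted at by the `<sys/prctl.h>` include added in ggml-impl.h below) is not shown in this diff:

```cpp
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstddef>

// Cached SVE register width in bytes; the hot paths above compare it to QK8_0 (32).
size_t lm_ggml_sve_cnt_b = 0;

// Hypothetical one-time initializer: svcntb() returns the SVE vector length in bytes,
// so a value of 32 corresponds to the 256-bit vectors the Q4_0_8_8 kernels expect.
static void lm_ggml_init_sve_cnt(void) {
    if (lm_ggml_sve_cnt_b == 0) {
        lm_ggml_sve_cnt_b = svcntb();
    }
}
#endif
```
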
@@ -351,15 +351,10 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
  }

  // an async copy would normally happen after all the queued operations on both backends are completed
- // sync src, set_async dst
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
- lm_ggml_backend_synchronize(backend_src);
- lm_ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, lm_ggml_nbytes(src));
- } else {
- lm_ggml_backend_synchronize(backend_src);
- lm_ggml_backend_tensor_copy(src, dst);
- lm_ggml_backend_synchronize(backend_dst);
- }
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+ lm_ggml_backend_synchronize(backend_src);
+ lm_ggml_backend_synchronize(backend_dst);
+ lm_ggml_backend_tensor_copy(src, dst);
  }

  // events
@@ -1782,7 +1777,17 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
  } else {
  lm_ggml_backend_synchronize(split_backend);
  }
- lm_ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ lm_ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ lm_ggml_backend_synchronize(split_backend);
+ }
+ lm_ggml_backend_tensor_copy(input, input_cpy);
+ }
  }
  }

package/cpp/ggml-impl.h CHANGED
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
  /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
  static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
  h.bits = (u.i >> 16) | 64; /* force to quiet */
  return h;
  }
- if (!(u.i & 0x7f800000)) { /* subnormal */
- h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
- return h;
- }
  h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
  return h;
  }
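
Removing the flush-to-zero branch means subnormal inputs now take the same round-to-nearest-even path as normal values. A standalone illustration of that rounding step, using plain integer types instead of the `lm_ggml_bf16_t` wrapper:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

// Round-to-nearest-even fp32 -> bf16 truncation, as in the non-NaN path above.
static uint16_t fp32_to_bf16_rne(float s) {
    uint32_t i;
    std::memcpy(&i, &s, sizeof(i));
    return (uint16_t) ((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main() {
    // A subnormal float (~1e-39) is no longer flushed to a signed zero bf16;
    // it rounds like any other value.
    printf("0x%04x\n", fp32_to_bf16_rne(1e-39f));
    // Ties-to-even: 1.00390625f (0x3F808000) is exactly halfway between two bf16
    // values and rounds to the even pattern 0x3F80 rather than 0x3F81.
    printf("0x%04x\n", fp32_to_bf16_rne(1.00390625f));
    return 0;
}
```
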
@@ -146,6 +143,7 @@ extern "C" {

  #if defined(__ARM_FEATURE_SVE)
  #include <arm_sve.h>
+ #include <sys/prctl.h>
  #endif

  // 16-bit float
package/cpp/ggml-metal.h CHANGED
@@ -50,6 +50,8 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_f

  LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb);

+ LM_GGML_API void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data);
+
  LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);

  // helper to check if the device supports a specific family
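
The new Metal hook follows ggml's abort-callback convention: the callback is polled while a graph is being computed, and returning true aborts the compute call, which is what makes mid-generation stopping practical on mobile. A hedged sketch of wiring it to a cancellation flag; the flag and the initialization shown in comments are illustrative:

```cpp
#include <atomic>

// Illustrative cancellation flag, e.g. flipped from a UI thread to stop generation.
static std::atomic<bool> g_cancel_requested{false};

// Polled by the Metal backend while a graph is being computed;
// returning true asks the backend to abort the current compute.
static bool metal_abort_cb(void * /*user_data*/) {
    return g_cancel_requested.load();
}

// Somewhere after creating the backend (sketch):
//   lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
//   lm_ggml_backend_metal_set_abort_callback(backend, metal_abort_cb, nullptr);
```
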