cui-llama.rn 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -28
- package/cpp/common.h +23 -4
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +54 -21
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +37 -12
- package/cpp/ggml.h +6 -4
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-vocab.cpp +10 -16
- package/cpp/llama-vocab.h +2 -0
- package/cpp/llama.cpp +432 -265
- package/cpp/llama.h +4 -1
- package/cpp/rn-llama.hpp +3 -4
- package/package.json +1 -1
package/android/src/main/jni.cpp
CHANGED
```diff
@@ -174,7 +174,7 @@ Java_com_rnllama_LlamaContext_initContext(
 
     const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
     if (lora_chars != nullptr && lora_chars[0] != '\0') {
-        defaultParams.
+        defaultParams.lora_adapters.push_back({lora_chars, lora_scaled});
         defaultParams.use_mmap = false;
     }
 
```
package/cpp/common.cpp
CHANGED
```diff
@@ -690,14 +690,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1660,6 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1772,6 +1783,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
```
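For reference, a minimal usage sketch of the `string_replace_all` helper introduced above; the `main` wrapper is illustrative and assumes linking against common.cpp:

```cpp
#include <cassert>
#include <string>

// Declared in common.h and defined in common.cpp as shown in this diff.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

int main() {
    std::string prompt = "Hello, {name}! Goodbye, {name}.";
    string_replace_all(prompt, "{name}", "world");
    assert(prompt == "Hello, world! Goodbye, world.");

    // An empty search string is a no-op rather than an infinite loop.
    string_replace_all(prompt, "", "x");
    assert(prompt == "Hello, world! Goodbye, world.");
    return 0;
}
```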
```diff
@@ -2045,8 +2067,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2061,7 +2083,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2081,7 +2103,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2115,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return
+            return iparams;
         }
     }
 
-
-
-
-
-
-
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return
+            return iparams;
         }
-
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2135,22 +2162,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 
-
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
 
-    mparams.vocab_only = params.vocab_only;
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.vocab_only = params.vocab_only;
     mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
```
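The new `llama_lora_adapters_apply` clears all adapters on the context and re-applies every entry whose scale is non-zero, which makes runtime hot-swapping straightforward. A minimal sketch, assuming an `llama_init_result` already obtained from `llama_init_from_gpt_params`; the function and variable names are illustrative:

```cpp
#include "common.h"

// Disable the first loaded adapter and re-apply the rest; a scale of 0.0f is
// skipped by llama_lora_adapters_apply, so the adapter stays loaded but inactive.
void disable_first_adapter(llama_init_result & init) {
    if (init.lora_adapters.empty()) {
        return;
    }
    init.lora_adapters[0].scale = 0.0f;
    llama_lora_adapters_apply(init.context, init.lora_adapters);
}
```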
```diff
@@ -3167,19 +3207,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (
-    if (
-
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-    fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (
-    if (
-
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-    fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
```
package/cpp/common.h
CHANGED
```diff
@@ -33,6 +33,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -138,8 +147,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    //
-    std::vector<
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -289,6 +298,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -320,8 +331,13 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-
-
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -329,6 +345,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
```
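Taken together, the common.h additions above change the return type of `llama_init_from_gpt_params` and allow LoRA adapters to be loaded without being applied. A minimal sketch of the new flow; the adapter path and the 0.5f scale are illustrative:

```cpp
#include "common.h"

int run_with_deferred_lora(gpt_params & params) {
    // Load the adapter at init time, but defer applying it to the context.
    params.lora_adapters.push_back({"adapters/my-adapter.gguf", 0.5f});
    params.lora_init_without_apply = true;

    llama_init_result init = llama_init_from_gpt_params(params);
    if (init.model == nullptr || init.context == nullptr) {
        return 1; // model or context failed to load
    }

    // ... run generations without the adapter ...

    // Later, apply every loaded adapter with a non-zero scale.
    llama_lora_adapters_apply(init.context, init.lora_adapters);
    return 0;
}
```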
package/cpp/ggml-aarch64.c
CHANGED
```diff
@@ -16,6 +16,8 @@
 
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
 #define UNUSED LM_GGML_UNUSED
@@ -384,8 +386,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -496,8 +498,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -614,7 +616,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -680,12 +682,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
         return;
     }
     else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (lm_ggml_cpu_has_neon()) {
-        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -745,8 +747,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1266,8 +1268,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1728,7 +1730,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2139,12 +2141,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
         return;
     }
     else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
    }
    else if (lm_ggml_cpu_has_neon()) {
-        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
    }
```
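The repeated `lm_ggml_sve_cnt_b == QK8_0` guards above gate the SVE fast paths on the runtime vector length: QK8_0 is 32, so the comparison only passes on 256-bit SVE hardware (32 bytes per vector). A hedged sketch of how such a byte count can be queried; the actual initialization of `lm_ggml_sve_cnt_b` lives in ggml.c, which is not shown here:

```cpp
// Sketch only: querying the SVE vector width in bytes via the ACLE intrinsic.
// ggml stores the equivalent value in lm_ggml_sve_cnt_b (initialized in ggml.c).
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static bool sve_is_256_bit() {
    // svcntb() returns the number of 8-bit elements per SVE vector,
    // so 32 bytes corresponds to a 256-bit implementation (== QK8_0).
    return svcntb() == 32;
}
#endif
```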
package/cpp/ggml-backend.c
CHANGED
```diff
@@ -351,15 +351,10 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
     }
 
     // an async copy would normally happen after all the queued operations on both backends are completed
-    //
-
-
-
-    } else {
-        lm_ggml_backend_synchronize(backend_src);
-        lm_ggml_backend_tensor_copy(src, dst);
-        lm_ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    lm_ggml_backend_synchronize(backend_src);
+    lm_ggml_backend_synchronize(backend_dst);
+    lm_ggml_backend_tensor_copy(src, dst);
 }
 
 // events
@@ -1782,7 +1777,17 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
                 } else {
                     lm_ggml_backend_synchronize(split_backend);
                 }
-
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    lm_ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        lm_ggml_backend_synchronize(split_backend);
+                    }
+                    lm_ggml_backend_tensor_copy(input, input_cpy);
+                }
             }
         }
 
```
package/cpp/ggml-impl.h
CHANGED
```diff
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This
- *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
@@ -146,6 +143,7 @@ extern "C" {
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif
 
 // 16-bit float
```
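With the subnormal branch removed above, subnormal float32 inputs now fall through to the same round-to-nearest-even truncation as normal values instead of being flushed to signed zero. A small self-contained sketch of that rounding expression (NaN quieting omitted; names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Mirrors the rounding step retained in lm_ggml_compute_fp32_to_bf16 above.
static uint16_t fp32_to_bf16_rne(float s) {
    uint32_t i;
    std::memcpy(&i, &s, sizeof(i));
    return (uint16_t) ((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main() {
    // 1.0f is 0x3F800000, so its bf16 encoding is simply the top 16 bits: 0x3F80.
    std::printf("bf16(1.0f)        = 0x%04X\n", fp32_to_bf16_rne(1.0f));
    // 1.00390625f (0x3F808000) is exactly halfway between bf16 0x3F80 and 0x3F81;
    // round-to-nearest-even keeps the even encoding 0x3F80.
    std::printf("bf16(1.00390625f) = 0x%04X\n", fp32_to_bf16_rne(1.00390625f));
    return 0;
}
```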
package/cpp/ggml-metal.h
CHANGED
```diff
@@ -50,6 +50,8 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_f
 
 LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb);
 
+LM_GGML_API void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data);
+
 LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
```
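The new `lm_ggml_backend_metal_set_abort_callback` lets a host application cancel an in-flight `lm_ggml_metal_graph_compute`. A minimal sketch, assuming the `lm_ggml_abort_callback` typedef from ggml.h (`bool (*)(void * data)`); the flag and function names are illustrative:

```cpp
#include <atomic>

#include "ggml-metal.h"

static std::atomic<bool> g_should_abort{false};

// Returning true asks the Metal backend to stop committing further command
// buffers; graph compute then returns LM_GGML_STATUS_ABORTED.
static bool abort_cb(void * /*user_data*/) {
    return g_should_abort.load();
}

void install_abort_callback(lm_ggml_backend_t backend) {
    lm_ggml_backend_metal_set_abort_callback(backend, abort_cb, /*user_data=*/nullptr);
}

// Elsewhere (e.g. on a UI thread) request cancellation with: g_should_abort = true;
```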
package/cpp/ggml-metal.m
CHANGED
```diff
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COUNT
 };
 
-struct
+struct lm_ggml_backend_metal_context {
     int n_cb;
 
     id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
     bool support_simdgroup_mm;
 
     bool should_capture_next_compute;
+
+    // abort lm_ggml_metal_graph_compute if callback returns true
+    lm_ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct
+static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct
+    struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-static void lm_ggml_metal_free(struct
+static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
     LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
     return nil;
 }
 
-static bool lm_ggml_metal_supports_op(const struct
+static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
             return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 }
 
 static enum lm_ggml_status lm_ggml_metal_graph_compute(
-        struct
+        struct lm_ggml_backend_metal_context * ctx,
         struct lm_ggml_cgraph * gf) {
 
     @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
             id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
             command_buffer_builder[cb_idx] = command_buffer;
 
-            // enqueue the command buffers
-
+            // always enqueue the first two command buffers
+            // enqueue all of the command buffers if we don't need to abort
+            if (cb_idx < 2 || ctx->abort_callback == NULL) {
+                [command_buffer enqueue];
+            }
         }
 
         const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
             LM_GGML_ASSERT(ne00 % 4 == 0);
             LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
-
-
-
-            const float eps = 1e-6f; // TODO: temporarily hardcoded
+            float eps;
+            memcpy(&eps, dst->op_params + 1, sizeof(float));
 
             const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
         [encoder endEncoding];
 
-
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
     });
 
     // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
            return LM_GGML_STATUS_FAILED;
        }
+
+       id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+       if (!next_buffer) {
+           continue;
+       }
+
+       bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+       if (next_queued) {
+           continue;
+       }
+
+       if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+           LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+           return LM_GGML_STATUS_ABORTED;
+       }
+
+       [next_buffer commit];
     }
 
     if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
 }
 
 LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     lm_ggml_metal_free(ctx);
     free(backend);
 }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
 }
 
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    struct
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_supports_op(metal_ctx, op);
 }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
-    struct
-
+    struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
+        LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
 void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
 }
 
+void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
 bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
 void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     ctx->should_capture_next_compute = true;
 }
 
```
package/cpp/ggml-quants.c
CHANGED
```diff
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
```