cui-llama.rn 1.0.6 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/android/src/main/jni.cpp +2 -2
- package/cpp/common.cpp +68 -29
- package/cpp/common.h +23 -4
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +54 -21
- package/cpp/ggml-quants.c +8 -8
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +81 -12
- package/cpp/ggml.h +6 -4
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-vocab.cpp +10 -16
- package/cpp/llama-vocab.h +2 -0
- package/cpp/llama.cpp +434 -265
- package/cpp/llama.h +4 -1
- package/cpp/rn-llama.hpp +7 -6
- package/ios/RNLlamaContext.mm +1 -1
- package/jest/mock.js +3 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -10,6 +10,7 @@ The following features have been added for Android:
 - Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
 - `vocab_only` mode: utilize the llama.cpp tokenizer
 - tokenizeSync: non-blocking, synchronous tokenizer function
+- Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
 
 Original repo README.md below.
 
package/android/src/main/jni.cpp
CHANGED
@@ -174,7 +174,7 @@ Java_com_rnllama_LlamaContext_initContext(
 
     const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
     if (lora_chars != nullptr && lora_chars[0] != '\0') {
-        defaultParams.
+        defaultParams.lora_adapters.push_back({lora_chars, lora_scaled});
         defaultParams.use_mmap = false;
     }
 
@@ -211,7 +211,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
     for (int i = 0; i < count; i++) {
         char key[256];
         llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-        char val[
+        char val[2048];
         llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
 
         putString(env, meta, key, val);
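The `char val[2048]` bump matters because GGUF metadata values (chat templates in particular) can easily overflow a small buffer. A minimal sketch of the same pattern against the public llama.cpp metadata API, assuming a loaded `llama_model *` (the buffer sizes and the `dump_model_meta` name are illustrative):

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: print every metadata key/value of a loaded model.
// llama_model_meta_count / _key_by_index / _val_str_by_index are the same
// public llama.cpp calls used by jni.cpp above.
static void dump_model_meta(const llama_model * model) {
    const int count = llama_model_meta_count(model);
    for (int i = 0; i < count; i++) {
        char key[256];
        char val[2048]; // large enough for long values such as chat templates
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        printf("%s = %s\n", key, val);
    }
}
```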
package/cpp/common.cpp
CHANGED
@@ -690,14 +690,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1640,7 +1650,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1660,6 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1772,6 +1783,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -2045,8 +2067,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2061,7 +2083,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2070,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2081,7 +2103,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2093,21 +2115,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return
+            return iparams;
         }
     }
 
-
-
-
-
-
-
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
            llama_free_model(model);
-            return
+            return iparams;
        }
-
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2135,22 +2162,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
 
-
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
 
-    mparams.vocab_only = params.vocab_only;
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.vocab_only = params.vocab_only;
     mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
@@ -3167,19 +3207,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (
-    if (
-
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-        fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (
-    if (
-
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
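The new `string_replace_all` helper replaces every occurrence left to right and advances past each replacement, so a replacement string that contains the search string cannot loop forever. A small usage sketch (the inputs are illustrative):

```cpp
#include <cassert>
#include <string>
#include "common.h"  // declares string_replace_all

int main() {
    std::string s = "a.b.c";
    string_replace_all(s, ".", "::");  // every occurrence is replaced
    assert(s == "a::b::c");

    std::string t = "aaa";
    string_replace_all(t, "a", "aa");  // skipping past the replacement prevents an infinite loop
    assert(t == "aaaaaa");

    std::string u = "abc";
    string_replace_all(u, "", "x");    // empty search string is a no-op by design
    assert(u == "abc");
    return 0;
}
```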
package/cpp/common.h
CHANGED
@@ -33,6 +33,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -138,8 +147,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    //
-    std::vector<
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -289,6 +298,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -320,8 +331,13 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-
-
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -329,6 +345,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
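Callers that previously unpacked a `std::tuple<llama_model *, llama_context *>` now receive a `llama_init_result`, and with `lora_init_without_apply` they can defer attaching the loaded adapters. A sketch of the new flow (the model path, adapter path, and scale are illustrative assumptions):

```cpp
#include "common.h"
#include "llama.h"

int main() {
    gpt_params params;
    params.model = "model.gguf";                                  // hypothetical model path
    params.lora_adapters.push_back({"style-lora.gguf", 0.8f});    // adapter path + user-defined scale
    params.lora_init_without_apply = true;                        // load the adapter, but do not attach it yet

    llama_init_result init = llama_init_from_gpt_params(params);  // replaces the old std::tuple return
    if (init.model == nullptr || init.context == nullptr) {
        return 1;
    }

    // later: clear and (re)apply the already-loaded adapters; entries with scale 0.0f are skipped
    llama_lora_adapters_apply(init.context, init.lora_adapters);

    llama_free(init.context);
    llama_free_model(init.model);
    return 0;
}
```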
package/cpp/ggml-aarch64.c
CHANGED
@@ -16,6 +16,8 @@
 
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
 #define UNUSED LM_GGML_UNUSED
@@ -384,8 +386,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -496,8 +498,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -614,7 +616,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -680,12 +682,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
         return;
     }
     else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (lm_ggml_cpu_has_neon()) {
-        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -745,8 +747,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1266,8 +1268,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (
-    LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
+        LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1728,7 +1730,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2139,12 +2141,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
         return;
     }
     else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (lm_ggml_cpu_has_neon()) {
-        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (
+        LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
package/cpp/ggml-backend.c
CHANGED
@@ -351,15 +351,10 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
     }
 
     // an async copy would normally happen after all the queued operations on both backends are completed
-    //
-
-
-
-    } else {
-        lm_ggml_backend_synchronize(backend_src);
-        lm_ggml_backend_tensor_copy(src, dst);
-        lm_ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    lm_ggml_backend_synchronize(backend_src);
+    lm_ggml_backend_synchronize(backend_dst);
+    lm_ggml_backend_tensor_copy(src, dst);
 }
 
 // events
@@ -1782,7 +1777,17 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
             } else {
                 lm_ggml_backend_synchronize(split_backend);
             }
-
+            // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+            // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+            if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                lm_ggml_backend_synchronize(input_backend);
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    lm_ggml_backend_synchronize(split_backend);
+                }
+                lm_ggml_backend_tensor_copy(input, input_cpy);
+            }
         }
     }
 
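Both changes fall back to the same pattern when no async copy path is available: synchronize the backends involved, then do a blocking tensor copy. Expressed with only the public backend API, that fallback looks roughly like this (the function name and arguments are placeholders, not part of the package):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Sketch of the blocking fallback: wait for pending work on both backends,
// then copy synchronously. backend_src/backend_dst/src/dst are placeholders.
static void copy_tensor_blocking(lm_ggml_backend_t backend_src,
                                 lm_ggml_backend_t backend_dst,
                                 struct lm_ggml_tensor * src,
                                 struct lm_ggml_tensor * dst) {
    lm_ggml_backend_synchronize(backend_src);
    lm_ggml_backend_synchronize(backend_dst);
    lm_ggml_backend_tensor_copy(src, dst);
}
```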
package/cpp/ggml-impl.h
CHANGED
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This
- *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
@@ -146,6 +143,7 @@ extern "C" {
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif
 
 // 16-bit float
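With the subnormal branch removed, every non-NaN input (subnormals included) now goes through the round-to-nearest-even bit trick kept at the end of the function. A standalone illustration of just that rounding step, independent of the `lm_ggml_bf16_t` types (the NaN-quieting path above it is omitted here):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Round-to-nearest-even fp32 -> bf16, mirroring the bit trick kept in the diff.
static uint16_t fp32_to_bf16_bits(float s) {
    uint32_t i;
    std::memcpy(&i, &s, sizeof(i));
    // add 0x7fff plus the lowest kept mantissa bit so ties round to even, then keep the top 16 bits
    return (uint16_t) ((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main() {
    printf("1.0f   -> 0x%04x\n", fp32_to_bf16_bits(1.0f));   // 0x3f80
    printf("1e-40f -> 0x%04x\n", fp32_to_bf16_bits(1e-40f)); // subnormal input: now rounded, not flushed to zero
    return 0;
}
```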
package/cpp/ggml-metal.h
CHANGED
@@ -50,6 +50,8 @@ LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_f
 
 LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb);
 
+LM_GGML_API void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data);
+
 LM_GGML_API LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
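The new `lm_ggml_backend_metal_set_abort_callback` exposes ggml's abort-callback mechanism for the Metal backend: the callback is polled during graph computation and returning true stops it, which gives the caller a way to interrupt a long-running compute. A hedged sketch of wiring a cancellation flag to it (the atomic flag and the `setup_metal_abort` helper are illustrative, not part of the package):

```cpp
#include <atomic>
#include "ggml-metal.h"

static std::atomic<bool> g_should_abort{false};

// lm_ggml_abort_callback: return true when computation should stop
static bool abort_cb(void * /*user_data*/) {
    return g_should_abort.load(std::memory_order_relaxed);
}

// Call once after creating the Metal backend; flip g_should_abort from a
// "stop" handler to cancel the in-flight graph compute.
void setup_metal_abort(lm_ggml_backend_t metal_backend) {
    lm_ggml_backend_metal_set_abort_callback(metal_backend, abort_cb, /*user_data=*/nullptr);
}
```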
|